#Alexander F. Gazmararian
#afg2@princeton.edu
#January 9, 2024

# Load packages
library(tidyverse)
library(tidylog)
library(modelsummary)
library(kableExtra)
library(broom)
library(here)

# Load custom functions
source("code/fun/digitsByRows.r")

#Load data
# Load data
g <- readRDS("data/output/pres_out.rds")
#Load matched data
g.nn <- readRDS("data/output/matched_df.rds")

# Table C.1: Summary Statistics
# Demographics
dem <- g %>%
  rename(workfemale = work_female,
         ruralnonfarm = rural_nonfarm) %>%
  group_by(sample,fips) %>%
  summarise(
    across(
      c(white, hispanic, foreign, college, income99pclog, poverty, ruralnonfarm, poplog, under40, debt06, workfemale),
      .fns = list("mean"=~mean(.x,na.rm=TRUE),"sd"=~sd(.x,na.rm=TRUE),"na"=~sum(is.na(.x),na.rm=TRUE))
    )
  ) %>%
  pivot_longer(cols = c(white_mean:workfemale_na)) %>%
  separate(col=name,into=c("var","stat"), sep="_") %>%
  pivot_wider(names_from=c(sample,stat)) %>%
  group_by(var) %>%
  summarise(
    across(contains(c("mean","sd")), ~ mean(.x,na.rm=TRUE)),
    across(contains("na"), ~ sum(.x,na.rm=TRUE))
  )
#Time-varying covariates
tvar <- g %>%
  rename(oilgasprop = oilgas_prop,
         coalprop = coal_prop,
         powerprop = power_prop) %>%
  group_by(sample) %>%
  summarise(
    across(
      c(oilgasprop,coalprop,powerprop,RepVotesMajorPercent),
      .fns = list("mean"=~mean(.x,na.rm=TRUE),"sd"=~sd(.x,na.rm=TRUE),"na"=~sum(is.na(.x),na.rm=TRUE))
    )
  ) %>%
  pivot_longer(cols = c(oilgasprop_mean:RepVotesMajorPercent_na)) %>%
  separate(col=name,into=c("var","stat"), sep="_") %>%
  pivot_wider(names_from=c(sample,stat))
#Distance to gas plants
gasdist <- g %>%
  filter(year>=1992) %>%
  rename(gasdist=NewGasDistance_s) %>%
  group_by(sample) %>%
  summarise(
    across(
      c(gasdist),
      .fns = list("mean"=~mean(.x,na.rm=TRUE),"sd"=~sd(.x,na.rm=TRUE),"na"=~sum(is.na(.x),na.rm=TRUE))
    )
  ) %>%
  pivot_longer(cols = c(gasdist_mean:gasdist_na)) %>%
  separate(col=name,into=c("var","stat"), sep="_") %>%
  pivot_wider(names_from=c(sample,stat))
#total number of observations
D_coal <- subset(g,sample=="Coal",select=county_state) %>% n_distinct()
N_coal <- subset(g,sample=="Coal") %>% nrow()
D_match <- subset(g,sample=="Matching Pool",select=county_state) %>% n_distinct()
N_match <- subset(g,sample=="Matching Pool") %>% nrow()
D_rest <- subset(g,sample=="Rest of Country",select=county_state) %>% n_distinct()
N_rest <- subset(g,sample=="Rest of Country") %>% nrow()
N_info <-
  data.frame(
    var = c("Units", "Total $N$"),
    Coal_mean = c(D_coal, N_coal),
    Coal_sd = c(NA,NA),
    Coal_na = c(NA,NA),
    `Matching Pool_mean` = c(D_match,N_match),
    `Matching Pool_sd` = c(NA,NA),
    `Matching Pool_na` = c(NA,NA),
    `Rest of Country_mean` = c(D_rest,N_rest),
    `Rest of Country_sd` = c(NA,NA),
    `Rest of Country_na` = c(NA,NA)
  )
names(dem) <- gsub(" ", ".", names(dem))
names(tvar) <- gsub(" ", ".", names(tvar))
names(gasdist) <- gsub(" ", ".", names(gasdist))
# Rearrange dem to match tvar and n_info
dem <- dem[c(10,4,3,1,5,7,8,6,9,2,11), c(1,2,5,8,3,6,9,4,7,10)]
sumstat <-
  rbind(
    digitsByRows(dem[,c(2:10)], 2),
    digitsByRows(gasdist[,c(2:10)], 2),
    digitsByRows(tvar[,c(2:10)], 2),
    digitsByRows(N_info[,c(2:10)], 0)
  )
# Bind rows together
sumstat$var <- c("White", "Hispanic", "Foreign-born", "College", "Income per capita (log)",
                 "Poverty", "Rural", "Population (log)", "Under 40 years", "Debt-to-income",
                 "Female workforce", "Gas plant distance (1992--2020)", "Hydraulic fracturing employment", "Coal employment",
                 "Power employment", "Two-party Republican vote share", "Counties", "$N$")
sumstat <- sumstat[,c(10,1:9)]
names(sumstat) <- c("", "Mean", "SD", "NA", "Mean", "SD", "NA", "Mean", "SD", "NA")
sumstat[1:16, 4] <- t(digitsByRows(as.numeric(sumstat[1:16,4]), 0))
sumstat[1:16, 7] <- t(digitsByRows(as.numeric(sumstat[1:16,7]), 0))
sumstat[1:16, 10] <- t(digitsByRows(as.numeric(sumstat[1:16,10]), 0))
# Create table
kbl(sumstat, booktabs = TRUE, escape = FALSE, format = "latex", linesep = "", digits=2) %>%
  add_header_above(c(" "=1, "Coal"=3,"Matching Pool"=3,"Rest of Country"=3)) %>%
  group_rows("Pretreatment:", 1, 11) %>%
  group_rows("Time-Varying:", 12, 16) %>%
  group_rows("Observations:", 17, 18) %>%
  cat(., file = here("output", "tables", "si_tab_C1_sumstats.txt"))
